*Get life-time earnings*
*Start from a clean state. -clear- only empties the data in the current frame;
*the helper frames temp and output that are created further down would survive
*an interrupted run and make the -frame copy- calls fail on a re-run, so all
*frames are reset as well.
clear
frames reset

use "Data\sample1_new.dta"

*Drop variables that are not used in this file (year1 is generated again below)
drop year1 year2 year3 year4 year5 year9 year10 year11 start_wage_ambition ten_wage_ambition old_ambition individual_wage_growth_ambition extreme_ambition wage10_ambition_temp n_wage10_ambition_group wage_start_ambition_temp n_wage_start_ambition_group wage10_ambition wage_start_ambition n_wage10_individual_ambition n_wage_start_individual_ambition wage10_mean_ambition wage_start_mean_ambition wage_growth_ambition

*Save and merge on later
save "Results\fig_A2\temp.dta", replace


*Identify the individuals that are observed for 30 years after graduation

*Flag persons that have a row for the year in which final_educ_year falls
generate temp=1 if aar==final_educ_year
bysort pnr: egen temp2=max(temp)

*year1: the graduation year, only for flagged persons with a non-missing graduation year from 1980 onwards
generate year1=.
replace year1=final_educ_year if temp2==1 & final_educ_year!=. & final_educ_year>=1980
drop temp temp2

*year30 lies 29 years after year1; obs_for_30 is 1 for persons that have a row in that year
generate year30=year1+29 if year1!=.
generate temp=1 if year1!=. & aar==year30
bysort pnr: egen obs_for_30=max(temp)
drop temp

*Years since graduation counted from 1, set to missing outside the range 1 to 30
generate year_since_grad=aar-year1+1
replace year_since_grad=. if !inrange(year_since_grad,1,30)

**Life-time wages

*Natural log of erhvervsindk_13
generate log_earnings=ln(erhvervsindk_13)

*Keep only the variables used from here on, to speed things up
keep pnr aar final_educ grad_region individual obs_for_30 year_since_grad log_earnings


*Placeholders that are filled in program by program further down
generate earnings_deflated=.
generate life_earnings=.

*The by final_educ steps that follow rely on this ordering
sort final_educ pnr aar

*Move observations with final_educ equal to 1 or missing out of the default
*frame and into the output frame; no program-level step is run for them

*final_educ==1: these rows start the output frame
frame copy default temp
frame temp: keep if final_educ==1
frame copy temp output
frame change default
drop if final_educ==1
frame drop temp

*missing final_educ: these rows are appended to the output frame
frame copy default temp
frame temp: keep if final_educ==.
frame output: frameappend temp
frame change default
drop if final_educ==.
frame drop temp

*Check if some programs have too few people observed for 30 years.
*Programs below the cut-off are moved to the output frame and removed from the default frame.
*NOTE(review): this comment used to ask for at least 30 individuals while the code compared against 50.
*The cut-off that was actually applied (50) is kept and is now set in one place - confirm which value is intended.
local min_obs30 50

*temp: sum of obs_for_30 over the rows with individual==1 within the program
*temp2: the same number spread to every row of the program
by final_educ: egen temp=sum(obs_for_30) if individual==1
by final_educ: egen temp2=max(temp)

*temp2 stays on the rows that go to the output frame and is dropped from that frame later in this file
frame copy default temp 
frame temp: keep if temp2<`min_obs30'
frame temp: drop temp
frame output: frameappend temp
frame change default
drop if temp2<`min_obs30'
drop temp temp2
frame drop temp


*Program level
*For every remaining program: regress log earnings on calendar-year dummies
*(base year 2000) and years-since-graduation dummies for persons with
*obs_for_30==1, subtract the estimated calendar-year effect from log earnings,
*and sum the result within person into life_earnings.

levelsof final_educ, matrow(programs)

mat dir /*lists the matrices in memory; programs was [444,1] in the original data*/

*Take the number of programs from the matrix instead of a hard-coded 444, so the loop keeps matching the data
local n_programs=rowsof(programs)

/*The loop took around 24 hours to run when every iteration copied the whole default frame.
  frame put copies only the rows of the current program.*/

forvalues j=1(1)`n_programs'{

	frame put if final_educ==programs[`j',1], into(temp)

	*Everything inside this block runs on the rows of the current program only
	frame temp {

		qui reg log_earnings ib2000.aar i.year_since_grad if obs_for_30==1

		*temp holds the calendar-year effect: 0 for the base year 2000
		gen temp=0 if aar==2000 & obs_for_30==1

		local p: colfullnames e(b)

		foreach i of local p{

			*Coefficient names with a dot in position 5 are the non-base, non-omitted year dummies (four digit year, dot, aar)
			if substr("`i'",5,1)=="." {
				local yr=substr("`i'",1,4)
				replace temp=_b[`yr'.aar] if aar==`yr' & obs_for_30==1
			}
		}

		replace earnings_deflated=log_earnings-temp if !missing(year_since_grad) & obs_for_30==1

		drop temp

		*Sum within person; egen sum returns 0 when there is nothing to add, which is turned into missing
		bysort pnr: egen temp=sum(earnings_deflated)

		replace life_earnings=temp
		replace life_earnings=. if life_earnings==0

		drop temp
	}

	*Hand the finished program over to the output frame and remove it from the default frame
	frame output: frameappend temp
	drop if final_educ==programs[`j',1]
	frame drop temp

}

frame change output

*temp2 is only present here if it came along with rows appended from the small-program step
capture confirm variable temp2, exact
if !_rc {
	drop temp2
}

*Save and continue from here*
save "Results\fig_A2\temp2.dta", replace

*clear allows this line to be used as the entry point even when other, changed data are in memory
use "Results\fig_A2\temp2.dta", clear

*Group by final_educ*
*Sort on the full key instead of final_educ alone. sort puts tied rows in a random order,
*and the row order matters later in this file: the k-means step draws its starting
*observations at random with a fixed seed, so an arbitrary row order makes the clusters irreproducible.
*NOTE(review): assumes pnr and aar identify a row within final_educ, as in the earlier sort of this file - confirm.
sort final_educ pnr aar

*get group variables
*temp: program mean of life_earnings over rows with individual==1, not computed for final_educ 1 or missing
*p_life_earnings: that mean spread to every row of the program

by final_educ: egen temp=mean(life_earnings) if final_educ!=. & final_educ!=1 & individual==1  
by final_educ: egen p_life_earnings=max(temp)
drop temp



*Fix problem with some large "out-dated" educational programs
*Old codes take over p_life_earnings from a reference program and, for the
*region-specific cases, get the region number appended to final_educ.

*Old codes that are mapped to the 9th grade reference code (1109 followed by the region).
*The same list now drives both the p_life_earnings assignment and the recode of final_educ.
*NOTE(review): the original assigned p_life_earnings to code 1007 but recoded code 1107;
*1007 is used for both here so that the recoded codes match the codes that received a regional value - confirm.
local old_9th 1007 1008 1023 1123 1009 1022
local old_9th_list : subinstr local old_9th " " ",", all

forvalues i=81(1)85{
	
	*9th grade
	*temp_2 is the p_life_earnings of the reference program, spread to all rows
	gen temp=p_life_earnings if final_educ==1109`i'
	egen temp_2=max(temp)
	replace p_life_earnings=temp_2 if inlist(final_educ,`old_9th_list') & grad_region==`i' 
	drop temp temp_2

	*make the old codes region specific
	foreach code of local old_9th {
		replace final_educ=`code'`i' if final_educ==`code' & grad_region==`i'
	}

	*10th grade
	gen temp=p_life_earnings if final_educ==1110`i'
	egen temp_2=max(temp)
	replace p_life_earnings=temp_2 if final_educ==1010 & grad_region==`i' 
	drop temp temp_2

	replace final_educ=1010`i' if final_educ==1010 & grad_region==`i'

}

*3.g 
*code 1097 takes over p_life_earnings from code 1198; final_educ itself is left unchanged
gen temp=p_life_earnings if final_educ==1198
egen temp_2=max(temp)
replace p_life_earnings=temp_2 if final_educ==1097
drop temp temp_2


**kmeans**
*Standardise p_life_earnings: subtract the mean and divide by the standard deviation,
*both taken over all rows in memory and kept in the scalars the_mean_s and the_sd_s
summarize p_life_earnings
scalar the_mean_s=r(mean)
scalar the_sd_s=r(sd)
generate p_life_earnings_s=(p_life_earnings-the_mean_s)/the_sd_s
summarize p_life_earnings_s


*k-means with 4 groups, started from randomly drawn observations with seed 1234;
*the group number is stored in life_earnings_ambition
cluster kmeans p_life_earnings_s, k(4) name(life_earnings_ambition) start(krandom(1234))
tabulate life_earnings_ambition

tabstat p_life_earnings_s, by(life_earnings_ambition)


*SAVE*
*Full data set with the standardised program value and the cluster variable
save "Results\fig_A2\life_earnings_ambition_updated.dta", replace

*Second file: one row per final_educ with the standardised value and its cluster.
*preserve and restore leave the full data set in memory afterwards.
preserve

keep final_educ p_life_earnings_s life_earnings_ambition

bys final_educ: keep if _n==1

*replace added, in line with the other saves in this file: without it a re-run stops here because the file already exists
save "Results\fig_A2\final_educ_lifetime_earnings_updated_moments_standardized.dta", replace
restore
